In [479]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [480]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

Warm Up


In [481]:
# Warm-up data: 10,000 draws from a normal distribution with mean 3 and standard deviation 1.
samples = np.random.normal(loc=3, scale=1, size=10000)

In [482]:
# Square-root rule: use roughly sqrt(n) bins for n observations.
bins = int(np.sqrt(len(samples)))
# `density=True` normalises the histogram to unit area. It replaces the
# `normed=True` keyword, which was deprecated in Matplotlib 2.1 and removed in 3.1.
_ = plt.hist(samples, bins=bins, density=True, cumulative=False)



In [483]:
# Square-root rule: use roughly sqrt(n) bins for n observations.
bins = int(np.sqrt(len(samples)))
# Cumulative, normalised histogram (an empirical CDF in bar form).
# `density=True` replaces the `normed=True` keyword, which was deprecated in
# Matplotlib 2.1 and removed in 3.1.
_ = plt.hist(samples, bins=bins, density=True, cumulative=True)


Baseball - EDA and inference


In [484]:
# MLB no-hitters dataset, fetched over HTTP.
nohitters_url = 'https://assets.datacamp.com/production/course_1550/datasets/mlb_nohitters.csv'

# NOTE: `parse_dates=True` only asks pandas to parse the index; `date` is a
# regular column here, so it is still loaded as int64 (see the info output).
df = pd.read_csv(
    nohitters_url,
    header=0,
    parse_dates=True,
)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 5 columns):
date               294 non-null int64
game_number        294 non-null int64
winning_team       228 non-null object
losing_team        228 non-null object
winning_pitcher    228 non-null object
dtypes: int64(2), object(3)
memory usage: 11.6+ KB

In [485]:
# Preview the first rows; `date` is still a raw YYYYMMDD integer at this point.
df.head()


Out[485]:
date game_number winning_team losing_team winning_pitcher
0 18760715 140 NaN NaN NaN
1 18800612 1035 NaN NaN NaN
2 18800617 1046 NaN NaN NaN
3 18800819 1177 NaN NaN NaN
4 18800820 1179 NaN NaN NaN

In [486]:
# Give the columns proper dtypes: parse the YYYYMMDD integers into timestamps
# and store the two team columns as categoricals.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
for team_col in ('winning_team', 'losing_team'):
    df[team_col] = df[team_col].astype('category')

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 5 columns):
date               294 non-null datetime64[ns]
game_number        294 non-null int64
winning_team       228 non-null category
losing_team        228 non-null category
winning_pitcher    228 non-null object
dtypes: category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 10.8+ KB

In [487]:
# Preview again to confirm that `date` now displays as a timestamp.
df.head()


Out[487]:
date game_number winning_team losing_team winning_pitcher
0 1876-07-15 140 NaN NaN NaN
1 1880-06-12 1035 NaN NaN NaN
2 1880-06-17 1046 NaN NaN NaN
3 1880-08-19 1177 NaN NaN NaN
4 1880-08-20 1179 NaN NaN NaN

In [488]:
# Days elapsed since the previous no-hitter, i.e. the difference between
# consecutive `date` values in row order. `Series.diff()` replaces the earlier
# shift-by-one self-merge and yields the same numbers.
# The first row has no predecessor, so its gap is NaT; it is recorded as 0 days.
#
# overwrite: `df` is rebound to a frame indexed by `date`. This cell needs
# `date` as a regular column, so re-run the loading/conversion cells before
# re-running it.
df = (
    df
    .assign(
        days_between=lambda frame: (
            frame['date'].diff().dt.days.fillna(0).astype('int64')
        )
    )
    .set_index('date')
)

df.head()


Out[488]:
game_number winning_team losing_team winning_pitcher days_between
date
1876-07-15 140 NaN NaN NaN 0
1880-06-12 1035 NaN NaN NaN 1428
1880-06-17 1046 NaN NaN NaN 5
1880-08-19 1177 NaN NaN NaN 63
1880-08-20 1179 NaN NaN NaN 1

In [489]:
# Empirical CDF of the observed gaps: sorted values on x, cumulative fraction i/n on y.
x_ = np.sort(df['days_between'])
n_ = len(x_)
y_ = np.arange(1, n_ + 1) / n_

In [490]:
# team to have most nohitters
data = df.groupby('winning_team')[['game_number']].count().sort_values(by='game_number', ascending=False).reset_index()
data = data.rename(columns={'game_number':'no hitters', 'winning_team':'winning team'})

data.plot(kind='bar', x='winning team', y='no hitters')
plt.title('Teams with most no hitters')
plt.tight_layout()



In [491]:
# team to have most no hitter loses
data = df.groupby('losing_team')[['game_number']].count().sort_values(by='game_number', ascending=False).reset_index()
data = data.rename(columns={'game_number':'no hitters', 'losing_team':'losing team'})

data.plot(kind='bar', x='losing team', y='no hitters')
plt.title('Teams with most loses')
plt.tight_layout()


Fit 1: normal distribution


In [492]:
# Mean and standard deviation of the gaps, used as the parameters of the normal model.
mu = df['days_between'].mean()
std = df['days_between'].std()

In [493]:
# Simulate the fitted normal model and compare it with the observed gaps.
normal_samples = np.random.normal(mu, std, size=10000)

# `density=True` puts both histograms on the same unit-area scale. It replaces
# the `normed=True` keyword, which was deprecated in Matplotlib 2.1 and removed in 3.1.
_ = plt.hist(df.days_between, bins=50, density=True, alpha=.3, label='Real')
_ = plt.hist(normal_samples, bins=50, density=True, histtype='step', color='black', label='Theoretical')

plt.legend()
plt.margins(0.02)
plt.show()



In [494]:
# ECDF of the simulated normal draws, overlaid on the ECDF of the observed gaps.
xt_ = np.sort(normal_samples)
nt_ = len(xt_)
yt_ = np.arange(1, nt_ + 1) / nt_

_ = plt.plot(x_, y_, linestyle='none', marker='.', label='Real')
_ = plt.plot(xt_, yt_, label='Theoretical', alpha=.5)

plt.legend(loc='lower right')
plt.margins(0.02)
plt.show()



In [495]:
# awful fit: the normal model does not match the observed days_between distribution

Fit 2: exponential distribution


In [496]:
# Mean gap in days between consecutive no-hitters.
df.days_between.mean()


Out[496]:
172.95238095238096

In [497]:
# Exponential model for the gaps; its scale parameter is set to the mean gap.
tau = df.days_between.mean()
exp_samples = np.random.exponential(tau, size=10000)

# `density=True` puts both histograms on the same unit-area scale. It replaces
# the `normed=True` keyword, which was deprecated in Matplotlib 2.1 and removed in 3.1.
_ = plt.hist(df.days_between, bins=50, density=True, alpha=.3, label='Real')
_ = plt.hist(exp_samples, bins=50, density=True, histtype='step', color='black', label='Theoretical')

plt.legend()
plt.margins(0.02)



In [498]:
# ECDF of the simulated exponential draws, overlaid on the ECDF of the observed gaps.
xt_ = np.sort(exp_samples)
nt_ = len(xt_)
yt_ = np.arange(1, nt_ + 1) / nt_

_ = plt.plot(x_, y_, linestyle='none', marker='.', label='Real')
_ = plt.plot(xt_, yt_, label='Theoretical', alpha=.5)

plt.legend(loc='lower right')
plt.margins(0.02)
plt.show()



In [499]:
# the exponential distribution is a more sensible model for days_between

Bootstrap sampling of the mean


In [500]:
# standard error of the mean, std.
sem = np.std(df.days_between) / np.sqrt(len(df.days_between))

print('population')
print('sem:', sem)
print('mu:', np.mean(df.days_between))


population
sem: 13.186171308
mu: 172.95238095238096

In [501]:
# sampling 
samples = []
size = len(df.days_between)
for i in range(10000):
    samples.append(np.random.choice(df.days_between, size=size).mean())

s_mu = np.mean(samples)
s_std = np.std(samples)

print('sampling')
print('std:', s_std)
print('mean:', s_mu)


sampling
std: 13.2079910696
mean: 173.114581973

In [502]:
# 2.5th and 97.5th percentiles of the resampled means: a 95% percentile interval.
np.percentile(samples, [2.5,97.5])


Out[502]:
array([ 148.06751701,  199.82653061])

In [503]:
# Square-root rule: use roughly sqrt(n) bins for the n resampled means.
bins = int(np.sqrt(len(samples)))

# `density=True` normalises the histogram to unit area. It replaces the
# `normed=True` keyword, which was deprecated in Matplotlib 2.1 and removed in 3.1.
_ = plt.hist(samples, density=True, bins=bins)